# Library -----------------------------------------------------------------
library(tidyverse)
library(pROC)
library(foreach) # For parallel looping
library(doParallel)
library(ggpubr)
library(rlist)
# Fix the RNG seed so simulation and any stochastic fitting are reproducible.
set.seed(123)
# lists to save results for final comparisons
# (filled by name — e.g. "ML_CMS", "ML_ICMS", "AUC", "ECE", "Brier" —
# and assembled into comparison figures/tables at the end of the script)
## model coefficients
list_coefficients <- list()
## calibration plots
list_calibration_plots <- list()
## ROC curve with AUC
list_ROC <- list()
## ECE
list_ECE <- list()
## Brier score
list_Brier <- list()
# Functions ---------------------------------------------------------------
expit <- function(x) {
  # Inverse-logit: maps the real line to (0, 1).
  # Written as 1 / (1 + exp(-x)) rather than exp(x) / (1 + exp(x)) so that
  # large positive x does not overflow exp() and produce NaN.
  1 / (1 + exp(-x))
}
logit <- function(x) {
  # Log-odds transform, the inverse of expit(); defined for x in (0, 1).
  odds <- x / (1 - x)
  log(odds)
}
simDat <- function(N, beta0, beta1, beta2, beta12) {
  # Simulate N observations from a logistic model with two independent
  # standard-normal covariates and an interaction term.
  #
  # Returns a list with:
  #   X        - N x 2 matrix of covariates (columns X1, X2)
  #   lin_pred - true linear predictor
  #   Y        - binary outcome drawn with probability inverse-logit(lin_pred)
  # Gaussian covariates
  X1 <- rnorm(n = N, mean = 0, sd = 1)
  X2 <- rnorm(n = N, mean = 0, sd = 1)
  # linear predictor
  lin_pred <- beta0 + X1 * beta1 + X2 * beta2 + X1 * X2 * beta12
  # event probability via the numerically stable inverse-logit form
  # (1 / (1 + exp(-x)) cannot overflow for large positive linear predictors)
  prob <- 1 / (1 + exp(-lin_pred))
  # outcome
  Y <- rbinom(n = N, size = 1, prob = prob)
  # implicit return (the original `%>% return()` is an anti-pattern)
  list(X = cbind(X1, X2),
       lin_pred = lin_pred,
       Y = Y)
}
# calibration plot
calibrationPlot <- function(data, caption) {
  # Calibration plot: observed outcome against predicted probability, with a
  # cubic-regression-spline GAM smoother. `data` must contain columns `pred`
  # and `Y`; `caption` is placed under the plot.
  plt <- ggplot(data = data,
                aes(x = pred, y = Y)) +
    geom_point(alpha = 0.3) +
    geom_smooth(method = "gam",
                formula = y ~ s(x, bs = "cs")) +
    labs(x = "Predicted probabilities",
         y = "Observation",
         caption = caption) +
    theme_bw()
  plt
}
ROCplot <- function(data, caption) {
  # ROC curve for the predictions in `data` (columns `pred` and `Y`),
  # with the AUC reported in the title and a grey diagonal reference line.
  #
  # Bug fixed: the original ended the `+` chain with
  # `geom_abline(...) %>% return()`. Since `%>%` binds more tightly than `+`,
  # return() was applied to the bare geom_abline layer rather than the
  # finished plot. The chain now ends normally and the plot is returned
  # implicitly. Constant intercept/slope are also passed as parameters
  # instead of being mapped inside aes().
  roc_obj <- roc(data$Y, data$pred)
  roc_df <- data.frame(sens = roc_obj$sensitivities,
                       spec = roc_obj$specificities)
  ggplot(data = roc_df,
         aes(y = sens,
             x = 1 - spec)) +
    geom_line() +
    geom_abline(intercept = 0,
                slope = 1,
                linetype = "dashed",
                color = "grey") +
    theme_bw() +
    labs(y = "Sensitivity",
         x = "1 - Specificity",
         title = paste("AUC = ", round(roc_obj$auc, 2)),
         caption = caption)
}
ECE <- function(obs, pred) {
  # Expected calibration error: mean absolute deviation of the predicted
  # probabilities from the observed binary outcomes (lower is better).
  abs_err <- abs(obs - pred)
  mean(abs_err)
}
Brier <- function(obs, pred) {
  # Brier score: mean squared difference between observed outcomes and
  # predicted probabilities (lower is better).
  sq_err <- (obs - pred)^2
  mean(sq_err)
}
predict_for_beta <- function(data, beta) {
  # Predicted probabilities for the interaction model at coefficient vector
  # `beta` = (intercept, X1, X2, X1:X2). `data` must contain columns X1, X2.
  #
  # Builds the design matrix directly with cbind() (avoids the
  # data.frame -> as.matrix round trip) and returns implicitly instead of
  # via the original `%>% return()` anti-pattern.
  Xdesign <- cbind(Intercept = 1,
                   X1 = data$X1,
                   X2 = data$X2,
                   Interaction = data$X1 * data$X2)
  # linear predictor (N x 1 matrix); drop the dimension to a plain vector
  lin_pred <- (Xdesign %*% beta)[, 1]
  # numerically stable inverse-logit (no overflow for large lin_pred)
  1 / (1 + exp(-lin_pred))
}
# function to optimize AUC
AUC_optimization <- function(data, step,
                             grid_search,
                             Xdesign) {
  # AUC on `data` for the candidate coefficient vector in row `step` of
  # `grid_search`, using the precomputed design matrix `Xdesign`
  # (intercept, X1, X2, interaction). Used as the objective for the
  # brute-force grid search.
  beta <- grid_search[step, ]
  # compute prediction
  lin_pred <- Xdesign %*% beta
  pred <- expit(lin_pred)[, 1]
  # estimate AUC (implicit return; the original `%>% return()` removed)
  roc(data$Y, pred)$auc
}
ECE_optimization <- function(data, step,
                             grid_search,
                             Xdesign) {
  # Expected calibration error on `data` for the candidate coefficient
  # vector in row `step` of `grid_search`, using the precomputed design
  # matrix `Xdesign`. Used as the objective for the brute-force grid search.
  beta <- grid_search[step, ]
  # compute prediction
  lin_pred <- Xdesign %*% beta
  pred <- expit(lin_pred)[, 1]
  # estimate ECE (original comment wrongly said AUC; implicit return)
  ECE(obs = data$Y, pred = pred)
}
Brier_optimization <- function(data, step,
                               grid_search,
                               Xdesign) {
  # Brier score on `data` for the candidate coefficient vector in row
  # `step` of `grid_search`, using the precomputed design matrix `Xdesign`.
  # Used as the objective for the brute-force grid search.
  beta <- grid_search[step, ]
  # compute prediction
  lin_pred <- Xdesign %*% beta
  pred <- expit(lin_pred)[, 1]
  # estimate Brier score (original comment wrongly said AUC; implicit return)
  Brier(obs = data$Y, pred = pred)
}
# load saved R workspace
# NOTE(review): this restores previously computed objects (e.g. train_dat,
# train_dat_df, val_dat_df, auc_results, ece_results, brier_results) that the
# commented-out code below would otherwise create — confirm "reproduce.RData"
# exists in the working directory before running.
load("reproduce.RData")
# Simulation --------------------------------------------------------------
# true OR coefficients
beta0 <- 1
beta1 <- 2.5
beta2 <- 4
beta12 <- 3
# train_dat <- simDat(N = 10000,
#                     beta0 = beta0, beta1 = beta1, beta2 = beta2,
#                     beta12 = beta12)
#
# val_dat <- simDat(N = 1000,
#                   beta0 = beta0, beta1 = beta1, beta2 = beta2,
#                   beta12 = beta12)
#
# # transform into data frames
# train_dat_df <- data.frame(Y = train_dat$Y) %>%
#   cbind(train_dat$X)
#
# val_dat_df <- data.frame(Y = val_dat$Y) %>%
#   cbind(val_dat$X)
# distribution of probabilities (density of the true event probabilities
# in the training data)
ggplot(data = data.frame(x = expit(train_dat$lin_pred)),
       aes(x = x)) +
  geom_density() +
  theme_bw() +
  labs(x = "Probability")
# observed outcome plotted against the true event probability
ggplot(data = data.frame(x = expit(train_dat$lin_pred),
                         y = train_dat$Y),
       aes(x = x,
           y = y)) +
  geom_point() +
  theme_bw() +
  labs(x = "Probability",
       y = "Outcome")
# Maximum likelihood estimation is the classical, standard approach to
# estimating the coefficients of a logistic regression.
# Correctly specified model: includes the X1:X2 interaction used in the
# data-generating mechanism. "CMS" presumably = correct model
# specification — TODO confirm naming.
# model fitting according to IWLS
ML_mod <- glm(Y ~ 1 + X1 + X2 + X1*X2,
              data = train_dat_df,
              family = "binomial")
# model coefficients, with the true data-generating values attached for
# comparison
ML_mod_coef <- summary(ML_mod)$coef %>%
  as.data.frame() %>%
  mutate(True = c(beta0, beta1, beta2, beta12),
         Coef = rownames(summary(ML_mod)$coef))
# dot plot of estimates with 95% Wald intervals (Estimate +/- 1.96 * SE),
# crosses mark the true coefficients
list_coefficients[["ML_CMS"]] <- ggplot(data = ML_mod_coef,
                                        aes(x = Coef,
                                            y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * `Std. Error`,
                    ymax = Estimate + 1.96 * `Std. Error`)) +
  geom_point(aes(y = True,
                 color = "True coefficient"),
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient",
       color = "",
       caption = "ML estimation, correct") +
  coord_flip()
# training AUC of the ML fit; reused later as a reference line in the
# grid-search plots
AUC_train_ML <- roc(train_dat_df$Y, ML_mod$fitted.values)$auc
# predict probabilities for validation data
val_dat_df$pred <- predict(ML_mod,
                           newdata = val_dat_df,
                           type = "response")
# calibration plot
list_calibration_plots[["ML_CMS"]] <- calibrationPlot(data = val_dat_df,
                                                      caption = "ML estimation, correct")
# ROC
list_ROC[["ML_CMS"]] <- ROCplot(data = val_dat_df,
                                caption = "ML estimation, correct")
# ECE / Brier score on the validation predictions
list_ECE[["ML_CMS"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ML_CMS"]] <- Brier(val_dat_df$Y, val_dat_df$pred)
# Misspecified model: omits the X1:X2 interaction present in the
# data-generating mechanism. "ICMS" presumably = incorrect model
# specification — TODO confirm naming.
# model fitting according to IWLS
# NOTE(review): this reuses/overwrites the `ML_mod` and `ML_mod_coef`
# objects from the correctly specified fit above.
ML_mod <- glm(Y ~ 1 + X1 + X2,
              data = train_dat_df,
              family = "binomial")
# model coefficients, with the true data-generating values attached
ML_mod_coef <- summary(ML_mod)$coef %>%
  as.data.frame() %>%
  mutate(True = c(beta0, beta1, beta2),
         Coef = rownames(summary(ML_mod)$coef))
# dot plot of estimates with 95% Wald intervals, crosses mark the true
# coefficients
list_coefficients[["ML_ICMS"]] <- ggplot(data = ML_mod_coef,
                                         aes(x = Coef,
                                             y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = Estimate - 1.96 * `Std. Error`,
                    ymax = Estimate + 1.96 * `Std. Error`)) +
  geom_point(aes(y = True,
                 color = "True coefficient"),
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient",
       color = "",
       caption = "ML estimation, incorrect") +
  coord_flip()
# predict probabilities for validation data
val_dat_df$pred <- predict(ML_mod,
                           newdata = val_dat_df,
                           type = "response")
# calibration plot
list_calibration_plots[["ML_ICMS"]] <- calibrationPlot(data = val_dat_df,
                                                       caption = "ML estimation, incorrect")
# ROC
list_ROC[["ML_ICMS"]] <- ROCplot(data = val_dat_df,
                                 caption = "ML estimation, incorrect")
# ECE / Brier score on the validation predictions
list_ECE[["ML_ICMS"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ML_ICMS"]] <- Brier(val_dat_df$Y, val_dat_df$pred)
# Set up for brute force grid search
# 4 dimensional space for searching: 13 x 46 x 31 x 31 = 574,678 candidate
# coefficient vectors (beta0, beta1, beta2, beta12)
grid_search <- expand.grid(beta0 = seq(-3, 3, by = 0.5),
                           beta1 = seq(-1, 3.5, by = 0.1),
                           beta2 = seq(2, 5, by = 0.1),
                           beta12 = seq(2, 5, by = 0.1)) %>%
  as.matrix()
# design matrix of the interaction model on the training data, precomputed
# once so each grid-search step is just a matrix product
Xdesign <- data.frame(Intercept = 1,
                      X1 = train_dat_df$X1,
                      X2 = train_dat_df$X2,
                      Interaction = train_dat_df$X1 * train_dat_df$X2) %>%
  as.matrix()
# The intercept coefficient is redundant here: it only shifts the predicted
# probabilities monotonically, so it is not identifiable from a rank-based
# discrimination metric such as the AUC.
# find best performing AUC based on brute-force grid search
n_steps <- nrow(grid_search)
# Setup parallel backend (adjust 'cores' based on your system)
# cores <- detectCores() - 1 # Leave one core free
#
# cl <- makeCluster(cores)
# registerDoParallel(cl)
#
# # Perform grid search with parallelization
# auc_results <- foreach(step = 1:n_steps,
#                        .combine = rbind,
#                        .packages = c("pROC", "dplyr")) %dopar% {
#   auc_value <- AUC_optimization(data = train_dat_df,
#                                 step = step,
#                                 grid_search = grid_search,
#                                 Xdesign = Xdesign)
#   c(step, auc_value) # Return step and corresponding AUC value
# }
#
# # Stop the parallel backend
# stopCluster(cl)
# NOTE(review): `auc_results` is restored from the loaded workspace; the
# commented loop above is the code that produced it.
AUC_results_df <- data.frame(step = auc_results[, 1],
                             AUC = auc_results[, 2]) %>%
  mutate(beta0 = grid_search[step, 1],
         beta1 = grid_search[step, 2],
         beta2 = grid_search[step, 3],
         beta12 = grid_search[step, 4])
# long format: one row per (candidate, coefficient) pair, for faceting
AUC_results_df_long <- AUC_results_df %>%
  gather(coef, value, -c(step, AUC))
# visualize results: AUC of every candidate against each coefficient value;
# horizontal line marks the training AUC of the ML fit
ggplot(data = AUC_results_df_long,
       aes(x = value,
           y = AUC,
           color = coef)) +
  geom_point() +
  geom_hline(aes(yintercept = AUC_train_ML)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)
# Find the best parameters (maximizing AUC)
best_AUC <- AUC_results_df %>% filter(AUC == max(AUC))
# best beta; intercept fixed at 0 because AUC cannot identify it (see note
# above)
# NOTE(review): unique() only collapses exact duplicates — if several
# distinct candidates tie on the maximal AUC this yields a vector longer
# than 4; confirm the optimum is unique.
AUC_beta <- c(0, best_AUC$beta1, best_AUC$beta2, best_AUC$beta12) %>%
  unique()
# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df,
                                    beta = AUC_beta)
# calibration plot
list_calibration_plots[["AUC"]] <- calibrationPlot(data = val_dat_df,
                                                   caption = "AUC optimization")
# ROC
list_ROC[["AUC"]] <- ROCplot(data = val_dat_df,
                             caption = "AUC optimization")
# ECE / Brier score
list_ECE[["AUC"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["AUC"]] <- Brier(val_dat_df$Y, val_dat_df$pred)
# ECE brute-force grid search over the same candidate grid
# cl <- makeCluster(cores)
# registerDoParallel(cl)
#
# # Perform grid search with parallelization
# ece_results <- foreach(step = 1:n_steps,
#                        .combine = rbind,
#                        .packages = c("dplyr")) %dopar% {
#   ece_value <- ECE_optimization(data = train_dat_df,
#                                 step = step,
#                                 grid_search = grid_search,
#                                 Xdesign = Xdesign)
#   c(step, ece_value) # Return step and corresponding ECE value
# }
#
# # Stop the parallel backend
# stopCluster(cl)
# NOTE(review): `ece_results` is restored from the loaded workspace; the
# commented loop above is the code that produced it.
ECE_results_df <- data.frame(step = ece_results[, 1],
                             ECE = ece_results[, 2]) %>%
  mutate(beta0 = grid_search[step, 1],
         beta1 = grid_search[step, 2],
         beta2 = grid_search[step, 3],
         beta12 = grid_search[step, 4])
# long format: one row per (candidate, coefficient) pair, for faceting
ECE_results_df_long <- ECE_results_df %>%
  gather(coef, value, -c(step, ECE))
# visualize results
# NOTE(review): the horizontal line uses AUC_train_ML on an ECE-scaled
# y-axis — likely a copy-paste from the AUC section; confirm the intended
# reference value.
ggplot(data = ECE_results_df_long,
       aes(x = value,
           y = ECE,
           color = coef)) +
  geom_point() +
  geom_hline(aes(yintercept = AUC_train_ML)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)
# Find the best parameters (minimizing ECE)
best_ECE <- ECE_results_df %>% filter(ECE == min(ECE))
# best beta (intercept is identifiable from a calibration metric, so it is
# kept here, unlike in the AUC search)
ECE_beta <- c(best_ECE$beta0, best_ECE$beta1, best_ECE$beta2, best_ECE$beta12)
# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df,
                                    beta = ECE_beta)
# calibration plot
list_calibration_plots[["ECE"]] <- calibrationPlot(data = val_dat_df,
                                                   caption = "ECE optimization")
# ROC
list_ROC[["ECE"]] <- ROCplot(data = val_dat_df,
                             caption = "ECE optimization")
# ECE / Brier score
list_ECE[["ECE"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["ECE"]] <- Brier(val_dat_df$Y, val_dat_df$pred)
# Brier-score brute-force grid search over the same candidate grid
# cl <- makeCluster(cores)
# registerDoParallel(cl)
#
# # Perform grid search with parallelization
# brier_results <- foreach(step = 1:n_steps,
#                          .combine = rbind,
#                          .packages = c("dplyr")) %dopar% {
#   brier_value <- Brier_optimization(data = train_dat_df,
#                                     step = step,
#                                     grid_search = grid_search,
#                                     Xdesign = Xdesign)
#   c(step, brier_value) # Return step and corresponding Brier value
# }
#
# # Stop the parallel backend
# stopCluster(cl)
# NOTE(review): `brier_results` is restored from the loaded workspace; the
# commented loop above is the code that produced it.
Brier_results_df <- data.frame(step = brier_results[, 1],
                               Brier = brier_results[, 2]) %>%
  mutate(beta0 = grid_search[step, 1],
         beta1 = grid_search[step, 2],
         beta2 = grid_search[step, 3],
         beta12 = grid_search[step, 4])
# long format: one row per (candidate, coefficient) pair, for faceting
Brier_results_df_long <- Brier_results_df %>%
  gather(coef, value, -c(step, Brier))
# visualize results
# NOTE(review): the horizontal line uses AUC_train_ML on a Brier-scaled
# y-axis — likely a copy-paste from the AUC section; confirm the intended
# reference value.
ggplot(data = Brier_results_df_long,
       aes(x = value,
           y = Brier,
           color = coef)) +
  geom_point() +
  geom_hline(aes(yintercept = AUC_train_ML)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(color = "") +
  facet_grid(~ coef)
# Find the best parameters (minimizing the Brier score)
best_Brier <- Brier_results_df %>% filter(Brier == min(Brier))
# best beta (intercept identifiable from a proper scoring rule, so kept)
Brier_beta <- c(best_Brier$beta0, best_Brier$beta1, best_Brier$beta2, best_Brier$beta12)
# predict for best beta
val_dat_df$pred <- predict_for_beta(data = val_dat_df,
                                    beta = Brier_beta)
# calibration plot
list_calibration_plots[["Brier"]] <- calibrationPlot(data = val_dat_df,
                                                     caption = "Brier score optimization")
# ROC
list_ROC[["Brier"]] <- ROCplot(data = val_dat_df,
                               caption = "Brier score optimization")
# ECE / Brier score
list_ECE[["Brier"]] <- ECE(val_dat_df$Y, val_dat_df$pred)
list_Brier[["Brier"]] <- Brier(val_dat_df$Y, val_dat_df$pred)
# coefficient estimates from the three metric-optimized searches, side by
# side with the true data-generating coefficients
# NOTE(review): this assumes AUC_beta, ECE_beta and Brier_beta each have
# length 4; ties in a grid search (see the unique() note above) would break
# the data.frame construction — confirm.
coef_results <- data.frame(AUC = AUC_beta,
                           ECE = ECE_beta,
                           Brier = Brier_beta,
                           True = c(beta0, beta1, beta2, beta12),
                           Coef = c("(Intercept)", "X1", "X2", "X1:X2")) %>%
  gather(Method, Estimate, -c(Coef, True))
# jittered dot plot of estimates per method; crosses mark true coefficients
list_coefficients[["AUC_ECE_Brier"]] <- ggplot(data = coef_results,
                                               aes(x = Coef,
                                                   y = Estimate,
                                                   color = Method)) +
  geom_point(position = position_jitter(width = 0.2, height = 0)) +
  geom_point(aes(y = True,
                 color = "True coefficient"),
             shape = 4, size = 3) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Model coefficient",
       color = "",
       caption = "AUC / Brier / ECE") +
  coord_flip()
# Final comparison of all methods, based on the validation data set.
# coefficient estimates: ML (correct), ML (misspecified), metric-optimized
ggarrange(list_coefficients$ML_CMS,
          list_coefficients$ML_ICMS,
          list_coefficients$AUC_ECE_Brier,
          nrow = 2, ncol = 2)
# calibration plots
ggarrange(list_calibration_plots$ML_CMS,
          list_calibration_plots$ML_ICMS,
          list_calibration_plots$AUC,
          list_calibration_plots$ECE,
          list_calibration_plots$Brier,
          nrow = 2, ncol = 3)
# AUC / ROC curves
ggarrange(list_ROC$ML_CMS,
          list_ROC$ML_ICMS,
          list_ROC$AUC,
          list_ROC$ECE,
          list_ROC$Brier)
# ECE: one-row table of ECE across methods (lower is better)
list_ECE %>% list.cbind() %>% knitr::kable()
# Pasted knitr output (commented out so the script remains valid R):
# | ML_CMS | ML_ICMS | AUC | ECE | Brier |
# |---|---|---|---|---|
# | 0.1808541 | 0.2429615 | 0.177698 | 0.1624228 | 0.1817578 |
# Brier score: one-row table of Brier scores across methods (lower is better)
list_Brier %>% list.cbind() %>% knitr::kable()
# Pasted knitr output (commented out so the script remains valid R):
# | ML_CMS | ML_ICMS | AUC | ECE | Brier |
# |---|---|---|---|---|
# | 0.0941789 | 0.1187399 | 0.1074768 | 0.0988773 | 0.0940754 |
# save(list_calibration_plots, file = "list_calibration_plots.RData")
# save(list_ROC, file = "list_ROC.RData")
# save(list_ECE, file = "list_ECE.RData")
# save(list_Brier, file = "list_Brier.RData")
# save entire workspace
# save.image(file = "reproduce.RData")